import pandas as pd
import plotly as plt
import numpy as np
pd.options.display.max_rows = 4000
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')
import plotly.express as px
import plotly.subplots as tls
import plotly
import plotly.offline as py
from plotly.offline import init_notebook_mode, iplot, plot
import plotly.graph_objs as go
init_notebook_mode(connected=True)
from pandas_profiling import ProfileReport
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV, GridSearchCV, StratifiedShuffleSplit
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
SEED = 42
from io import StringIO
import re
# Pre-process the raw CSV in memory before handing it to pandas: on every
# line, at most 17 occurrences of ", " are rewritten to ";" so that only the
# remaining commas are seen as field separators by read_csv.
comma_space = re.compile(r', ')
for_pd = StringIO()
with open("data/kickstarter/ks-projects-201612.csv") as data_file:
    for line in data_file:
        new_line = comma_space.sub(';', line.rstrip(), count=17)
        for_pd.write(new_line + "\n")
# Rewind the buffer so that read_csv starts from the first line.
for_pd.seek(0)
df = pd.read_csv(for_pd, sep=',', low_memory=False)
# Strip surrounding whitespace from the column labels and discard columns
# that contain no data at all.
df = df.rename(columns=str.strip)
df = df.dropna(axis=1, how='all')
# Rows that carry a value in the extra "Unnamed: 13" column are set aside in
# df2 (as an independent copy); all other rows stay in df.
overflow = df["Unnamed: 13"].notnull()
df2 = df[overflow].copy(deep=True)
df = df[~overflow].dropna(axis=1, how='all')
df.head()
| ID | name | category | main_category | currency | deadline | goal | launched | pledged | state | backers | country | usd pledged | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1000002330 | The Songs of Adelaide & Abullah | Poetry | Publishing | GBP | 2015-10-09 11:36:00 | 1000 | 2015-08-11 12:12:28 | 0 | failed | 0 | GB | 0 |
| 1 | 1000004038 | Where is Hank? | Narrative Film | Film & Video | USD | 2013-02-26 00:20:50 | 45000 | 2013-01-12 00:20:50 | 220 | failed | 3 | US | 220 |
| 2 | 1000007540 | ToshiCapital Rekordz Needs Help to Complete Album | Music | Music | USD | 2012-04-16 04:24:11 | 5000 | 2012-03-17 03:24:11 | 1 | failed | 1 | US | 1 |
| 3 | 1000011046 | Community Film Project: The Art of Neighborhoo... | Film & Video | Film & Video | USD | 2015-08-29 01:00:00 | 19500 | 2015-07-04 08:35:03 | 1283 | canceled | 14 | US | 1283 |
| 4 | 1000014025 | Monarch Espresso Bar | Restaurants | Food | USD | 2016-04-01 13:38:27 | 50000 | 2016-02-26 13:38:27 | 52375 | successful | 224 | US | 52375 |
The dataframe has 323750 rows
df2.count()
ID 34 name 34 category 29 main_category 34 currency 34 deadline 34 goal 34 launched 34 pledged 34 state 34 backers 34 country 34 usd pledged 34 Unnamed: 13 34 dtype: int64
df.describe()
| ID | name | category | main_category | currency | deadline | goal | launched | pledged | state | backers | country | usd pledged | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 323716 | 323712 | 323716 | 323716 | 323716 | 323716 | 323716 | 323716 | 323716 | 323716 | 323716 | 323716 | 319879 |
| unique | 323716 | 321577 | 158 | 16 | 53 | 295323 | 7618 | 323220 | 55019 | 34 | 3583 | 54 | 94483 |
| top | 537793224 | New EP/Music Development | Product Design | Film & Video | USD | 2012-01-01 05:59:00 | 5000 | 1970-01-01 01:00:00 | 0 | failed | 0 | US | 0 |
| freq | 1 | 41 | 17503 | 57733 | 260752 | 47 | 25570 | 7 | 45873 | 168564 | 48955 | 258014 | 50553 |
df[df['name'].isnull()]
| ID | name | category | main_category | currency | deadline | goal | launched | pledged | state | backers | country | usd pledged | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 142830 | 1848699072 | NaN | Narrative Film | Film & Video | USD | 2012-02-29 15:04:00 | 200000 | 2012-01-01 12:35:31 | 100 | failed | 1 | US | 100 |
| 262901 | 634871725 | NaN | Video Games | Games | GBP | 2013-01-06 23:00:00 | 2000 | 2012-12-19 23:57:48 | 196 | failed | 12 | GB | 317.7284362 |
| 265251 | 648853978 | NaN | Product Design | Design | USD | 2016-07-18 05:01:47 | 2500 | 2016-06-18 05:01:47 | 0 | suspended | 0 | US | 0 |
| 289847 | 796533179 | NaN | Painting | Art | USD | 2011-12-05 05:59:00 | 35000 | 2011-11-06 23:55:55 | 220 | failed | 5 | US | 220 |
df.dropna(subset = ["name"], inplace=True)
df.isnull().any()
ID False name False category False main_category False currency False deadline False goal False launched False pledged False state False backers False country False usd pledged True dtype: bool
# Repair rows whose values sit one column too far to the left.  Such rows are
# recognised by main_category == 'USD' (a currency code where a category is
# expected).  For those rows every column takes the value of its left-hand
# neighbour; all other rows are left untouched.
#
# The assignments run from the right-most column to the left so that each
# source column is read before it is overwritten.  'main_category' itself is
# not modified in this block, so the 'USD' test selects the same rows on
# every line.
df['usd pledged'] = np.where(df['main_category'] == 'USD', df['country'], df['usd pledged'])
df['country'] = np.where(df['main_category'] == 'USD', df['backers'], df['country'])
df['backers'] = np.where(df['main_category'] == 'USD', df['state'], df['backers'])
df['state'] = np.where(df['main_category'] == 'USD', df['pledged'], df['state'])
df['pledged'] = np.where(df['main_category'] == 'USD', df['launched'], df['pledged'])
df['launched'] = np.where(df['main_category'] == 'USD', df['goal'], df['launched'])
df['goal'] = np.where(df['main_category'] == 'USD', df['deadline'], df['goal'])
df['deadline'] = np.where(df['main_category'] == 'USD', df['currency'], df['deadline'])
df['currency'] = np.where(df['main_category'] == 'USD', df['main_category'], df['currency'])
# Keep a copy of 'category' before it is overwritten: for the shifted rows it
# is the value that has to end up in 'main_category' later on.
df['tmp'] = df['category']
df['category'] = np.where(df['main_category'] == 'USD', df['name'], df['category'])
# For the shifted rows the 'ID' field is split on ';': the first piece is
# kept as the ID and the last piece becomes the name.
# NOTE(review): str[-1] keeps only the text after the LAST ';' -- if a name
# itself contains ';', the earlier part of the name is dropped; verify.
df['name'] = np.where(df['main_category'] == 'USD', df['ID'].str.split(';').str[-1], df['name'])
df['ID'] = np.where(df['main_category'] == 'USD', df['ID'].str.split(';').str[0], df['ID'])
df[df['main_category']=='USD']
| ID | name | category | main_category | currency | deadline | goal | launched | pledged | state | backers | country | usd pledged | tmp | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 6262 | 1037047712 | Second Life Second Chance (my very first album ) | Pop | USD | USD | 2010-07-17 19:06:00 | 4000 | 2010-06-22 12:34:36 | 953 | failed | 19 | US | 953 | Music |
| 11153 | 1065173583 | Help promote Strangefruit! | Fashion | USD | USD | 2009-09-21 05:43:00 | 1500 | 2009-08-28 18:16:11 | 0 | failed | 0 | US | 0 | Fashion |
| 13225 | 1077797875 | THE TORCH MUST KEEP BURNING! FUND THE NEW RECO... | Indie Rock | USD | USD | 2011-02-11 19:19:44 | 6000 | 2010-12-13 19:19:44 | 1181 | failed | 9 | US | 1181 | Music |
| 26156 | 1154862705 | Let's finish Katie Moore's new album together! | Country & Folk | USD | USD | 2010-12-02 05:40:36 | 3000 | 2010-10-30 05:40:36 | 4581 | successful | 87 | US | 4581 | Music |
| 33867 | 1201238408 | The Living Wetlands | Film & Video | USD | USD | 2011-05-25 12:19:50 | 500 | 2011-02-23 12:19:50 | 45 | failed | 2 | US | 45 | Film & Video |
| 34817 | 1206936256 | SAVE Blue Like Jazz! (the movie) | Narrative Film | USD | USD | 2010-10-26 05:59:00 | 125000 | 2010-09-24 23:18:05 | 345992.47 | successful | 4495 | US | 345992.47 | Film & Video |
| 40953 | 1243737397 | 'A Taste of Happiness' a new CD of songs by ... | Music | USD | USD | 2011-04-06 04:26:49 | 4000 | 2011-02-24 04:26:49 | 4451 | successful | 78 | US | 4451 | Music |
| 45947 | 1273172245 | SHAKTI! | Narrative Film | USD | USD | 2010-12-16 17:47:59 | 5000 | 2010-11-16 17:47:59 | 5117 | successful | 58 | US | 5117 | Film & Video |
| 60526 | 1359752672 | A New Sudan Through War or Peace | Photography | USD | USD | 2010-12-20 06:00:00 | 5000 | 2010-11-04 17:44:42 | 2001 | failed | 25 | US | 2001 | Photography |
| 71086 | 1422544453 | An Artist in the ARCTIC RESIDENCY | Photography | USD | USD | 2010-12-06 09:29:27 | 1300 | 2010-11-06 08:29:27 | 1340 | successful | 16 | US | 1340 | Photography |
| 75688 | 14498203 | 11 24: A Short Film About Collecting Cans and ... | Shorts | USD | USD | 2011-04-25 21:17:35 | 1500 | 2011-01-24 21:17:35 | 1900 | successful | 40 | US | 1900 | Film & Video |
| 83402 | 1495194447 | Murder Mystery Theatre Video Project | Theater | USD | USD | 2011-03-20 04:11:03 | 600 | 2011-02-18 04:11:03 | 755 | successful | 18 | US | 755 | Theater |
| 94718 | 1562731910 | The Genesis of images by Scott | Photography | USD | USD | 2011-03-08 12:15:02 | 550 | 2010-12-08 12:15:02 | 25 | failed | 1 | US | 25 | Photography |
| 110033 | 165319510 | 15 A Feature Length Film (Canceled) | Narrative Film | USD | USD | 2011-05-12 05:00:00 | 3600 | 2011-04-11 06:44:40 | 144 | canceled | 10 | US | 144 | Film & Video |
| 116504 | 1691556823 | Suzanne Carrico's Debut Recording - What Chris... | Music | USD | USD | 2010-08-31 23:07:00 | 5000 | 2010-07-26 23:52:25 | 5551 | successful | 65 | US | 5551 | Music |
| 140462 | 1834923380 | Youth Choir fromSouth Africa needs a new home! | World Music | USD | USD | 2010-11-23 20:27:20 | 35000 | 2010-10-24 20:27:20 | 0 | failed | 0 | US | 0 | Music |
| 141128 | 183917362 | BROTHER LIAR JOHN - A Feature Film | Shorts | USD | USD | 2011-03-18 22:00:00 | 4500 | 2011-02-01 13:56:26 | 131 | failed | 5 | US | 131 | Film & Video |
| 145290 | 1863308697 | L'Amore della Musica Quintet's Concert Series ... | Classical Music | USD | USD | 2011-03-21 18:58:31 | 5000 | 2011-02-19 19:58:31 | 5551 | successful | 66 | US | 5551 | Music |
| 146916 | 1872898319 | Perishable Realities : Artist Exhibiton Funding | Art | USD | USD | 2010-12-13 23:05:13 | 3500 | 2010-10-14 23:05:13 | 10 | failed | 1 | US | 10 | Art |
| 152208 | 1904422702 | 13th Dream | Film & Video | USD | USD | 2011-01-19 23:15:24 | 5000 | 2010-10-21 23:15:24 | 25 | failed | 1 | US | 25 | Film & Video |
| 152732 | 1907771986 | Fund A Mike Kuchar Film: Filmmaker Founded Un... | Film & Video | USD | USD | 2010-08-13 02:16:00 | 600 | 2010-07-15 02:41:12 | 6 | failed | 2 | US | 6 | Film & Video |
| 164754 | 1979723427 | Music Program for the Under Privileged | Music | USD | USD | 2010-11-19 05:17:29 | 3000 | 2010-10-20 05:17:29 | 0 | failed | 0 | US | 0 | Music |
| 164790 | 1979950922 | Deliver Flow Tribe's Next Funk Baby! | Music | USD | USD | 2011-03-13 16:12:19 | 2000 | 2011-02-11 17:12:19 | 5135 | successful | 38 | US | 5135 | Music |
| 165475 | 1984073278 | Essentials of Flor | Dance | USD | USD | 2011-02-01 18:00:00 | 2500 | 2010-12-30 21:40:45 | 2980 | successful | 23 | US | 2980 | Dance |
| 166067 | 1987910949 | White Buffalo Stands: Sacred Songs to Benefit ... | Music | USD | USD | 2010-10-19 22:08:41 | 13500 | 2010-09-16 22:08:41 | 885 | failed | 9 | US | 885 | Music |
| 184709 | 2099851565 | Expanding My Handmade Jewelry Collection! | Fashion | USD | USD | 2011-04-13 08:11:00 | 5000 | 2011-03-06 20:42:34 | 5150 | successful | 72 | US | 5150 | Fashion |
| 189538 | 2128520745 | IT’S A HOT CAPPUCCINO NIGHT | Fiction | USD | USD | 2010-12-05 00:46:07 | 2000 | 2010-11-04 23:46:07 | 20 | failed | 1 | US | 20 | Publishing |
| 206036 | 294371724 | A Solar Powered Music Project: Artist Residenc... | World Music | USD | USD | 2010-12-16 11:04:40 | 4000 | 2010-11-06 10:04:40 | 635 | failed | 17 | US | 635 | Music |
| 209992 | 317893557 | Native Americans Documentary | Documentary | USD | USD | 2010-03-15 19:52:00 | 5000 | 2009-12-15 22:39:59 | 135 | failed | 3 | US | 135 | Film & Video |
| 247705 | 542758877 | Recording of Wunmi's dream Album: Ghana or bust! | World Music | USD | USD | 2010-02-08 05:59:00 | 15500 | 2009-11-23 05:22:29 | 17405 | successful | 81 | US | 17405 | Music |
| 251867 | 56795857 | Gospel plays at the DC Black Theatre Festival! | Theater | USD | USD | 2011-06-06 01:19:43 | 1000 | 2011-03-07 01:19:43 | 25 | failed | 1 | US | 25 | Theater |
| 253102 | 575325959 | We promise once you have bit of Poundtastic | Food | USD | USD | 2011-05-28 01:44:52 | 10000 | 2011-02-26 01:44:52 | 0 | failed | 0 | US | 0 | Food |
| 261877 | 628401820 | Starving Artist Only Hungry for His Art | Hip-Hop | USD | USD | 2011-01-07 18:36:43 | 6000 | 2010-11-23 18:36:43 | 0 | failed | 0 | US | 0 | Music |
| 268592 | 668533160 | Visions of Johanne - A photo essay | Photography | USD | USD | 2010-06-02 05:23:00 | 3000 | 2010-04-19 16:58:54 | 3070 | successful | 30 | US | 3070 | Photography |
| 273447 | 697713591 | Movie Inspired by Actual Events | Documentary | USD | USD | 2010-09-10 22:31:31 | 25000 | 2010-08-11 22:31:31 | 0 | failed | 0 | US | 0 | Film & Video |
| 279680 | 735066238 | In Search of the Miraculous | Photography | USD | USD | 2011-02-15 08:59:00 | 3900 | 2011-01-11 02:38:53 | 4540.99 | successful | 111 | US | 4540.99 | Photography |
| 287438 | 781972011 | LONGEVITY -A PHOTO EXPLORATION INTO LOVE (FRANCE) | Photography | USD | USD | 2009-11-16 05:59:00 | 4000 | 2009-10-04 19:12:44 | 95 | failed | 4 | US | 95 | Photography |
| 291705 | 807628947 | Edgar White Recording Project | Indie Rock | USD | USD | 2011-04-08 20:50:59 | 2500 | 2011-02-21 20:50:59 | 2906 | successful | 35 | US | 2906 | Music |
| 292593 | 813070034 | The Prolific Needs Your Help To Create Their D... | Rock | USD | USD | 2010-10-17 07:06:24 | 1200 | 2010-09-02 07:06:24 | 300 | failed | 5 | US | 300 | Music |
| 296204 | 834654830 | Send EVOLVE LOVE Team to film the Climate Summ... | Documentary | USD | USD | 2010-11-30 05:59:00 | 25000 | 2010-11-09 05:53:00 | 27871.88 | successful | 208 | US | 27871.88 | Film & Video |
# Finish the realignment: rows still flagged by the 'USD' marker take the
# value saved in 'tmp' as their main category, then the helper column goes.
df['main_category'] = df['main_category'].mask(df['main_category'] == 'USD', df['tmp'])
df = df.drop(columns='tmp')
df.describe()
| ID | name | category | main_category | currency | deadline | goal | launched | pledged | state | backers | country | usd pledged | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 323712 | 323712 | 323712 | 323712 | 323712 | 323712 | 323712 | 323712 | 323712 | 323712 | 323712 | 323712 | 319915 |
| unique | 323712 | 321600 | 158 | 15 | 13 | 295330 | 7578 | 323224 | 55020 | 6 | 3583 | 22 | 94486 |
| top | 537793224 | New EP/Music Development | Product Design | Film & Video | USD | 2012-01-01 05:59:00 | 5000 | 1970-01-01 01:00:00 | 0 | failed | 0 | US | 0 |
| freq | 1 | 41 | 17502 | 57743 | 260789 | 47 | 25577 | 7 | 45878 | 168583 | 48960 | 258051 | 50558 |
# Treat the literal string 'N,"0' in the country column as a missing value;
# the replaced entries are reported as NaN by value_counts(dropna=False).
df['country'] = df['country'].replace(['N,"0'],[None])
df['country'].value_counts(dropna=False)
US 258051 GB 27568 CA 12007 AU 6251 NaN 3797 DE 2684 NL 2265 FR 1911 IT 1750 ES 1373 SE 1271 NZ 1137 DK 825 IE 576 NO 526 CH 471 BE 402 AT 377 MX 214 SG 119 HK 97 LU 40 Name: country, dtype: int64
df.dtypes
ID object name object category object main_category object currency object deadline object goal object launched object pledged object state object backers object country object usd pledged object dtype: object
df.isnull().any()
ID False name False category False main_category False currency False deadline False goal False launched False pledged False state False backers False country True usd pledged True dtype: bool
df2.isnull().any()
ID False name False category True main_category False currency False deadline False goal False launched False pledged False state False backers False country False usd pledged False Unnamed: 13 False dtype: bool
df2.head()
| ID | name | category | main_category | currency | deadline | goal | launched | pledged | state | backers | country | usd pledged | Unnamed: 13 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 32886 | 1195071984 | The Polyamorous Socks - It's always a good match | ) | Product Design | Design | SGD | 2016-12-01 01:00:00 | 5000 | 2016-11-01 16:15:07 | 3300 | failed | 72 | SG | 2920.061449 |
| 33513 | 1199073018 | Bear Witness- Too Cool For School Bus Tour (~) | } (Canceled) | Hip-Hop | Music | USD | 2015-09-24 11:47:36 | 50000 | 2015-08-25 11:47:36 | 0 | canceled | 0 | US | 0.000000 |
| 36188 | 1215141595 | The 1st Fun Children Books About How To Love O... | ) | Children's Books | Publishing | EUR | 2016-11-09 17:34:17 | 4000 | 2016-10-10 17:34:17 | 4287 | successful | 158 | FR | 2275.247857 |
| 36671 | 1218074363 | I am Rupert | NaN | Webseries | Film & Video | GBP | 2014-05-16 22:25:57 | 2000 | 2014-03-27 21:25:57 | 5 | failed | 1 | GB | 8.267797 |
| 41069 | 124438738 | BlanketPals TM | NaN | Interactive Design | Design | USD | 2015-03-25 20:51:48 | 9750 | 2015-02-23 21:51:48 | 10890.45 | successful | 107 | US | 10890.450000 |
# df2 holds the rows that spilled into the extra "Unnamed: 13" column: their
# values sit one column too far to the right, starting at 'category'.
# Shift every column back by one position, working left to right so that each
# source column is read before it is overwritten.
# Save 'category' first: it is appended to 'name' below, where it is not null.
df2['tmp'] = df2['category']
df2['category'] = df2['main_category']
df2['main_category'] = df2['currency']
df2['currency'] = df2["deadline"]
df2["deadline"] = df2["goal"]
df2["goal"] = df2["launched"]
df2["launched"] = df2["pledged"]
df2['pledged'] = df2['state']
df2["state"] = df2["backers"]
df2["backers"] = df2["country"]
df2["country"] = df2["usd pledged"]
df2["usd pledged"] = df2["Unnamed: 13"]
# Re-attach the saved fragment to the name when there is one (rows whose
# fragment is null keep their name unchanged).
# NOTE(review): the two pieces are concatenated directly; whatever separator
# originally stood between them is not restored -- confirm this is intended.
df2['name'] = np.where(~df2['tmp'].isnull(), df2['name']+df2['tmp'], df2['name'])
# The helper column and the overflow column are no longer needed.
df2 = df2.drop(columns=['tmp', 'Unnamed: 13'])
df2.isnull().any()
ID False name False category False main_category False currency False deadline False goal False launched False pledged False state False backers False country False usd pledged False dtype: bool
df2.state.value_counts(dropna=False)
failed 22 canceled 6 successful 6 Name: state, dtype: int64
# Re-attach the repaired overflow rows to the main frame.  DataFrame.append
# was deprecated in pandas 1.4 and removed in pandas 2.0; pd.concat is the
# supported equivalent and likewise renumbers the index from 0.
df = pd.concat([df, df2], ignore_index=True)
df.describe()
| ID | name | category | main_category | currency | deadline | goal | launched | pledged | state | backers | country | usd pledged | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 323746 | 323746 | 323746 | 323746 | 323746 | 323746 | 323746 | 323746 | 323746 | 323746 | 323746 | 319949 | 319949 |
| unique | 323746 | 321634 | 158 | 15 | 13 | 295360 | 7578 | 323258 | 55023 | 6 | 3583 | 21 | 94516 |
| top | 537793224 | New EP/Music Development | Product Design | Film & Video | USD | 2012-01-01 05:59:00 | 5000 | 1970-01-01 01:00:00 | 0 | failed | 0 | US | 0 |
| freq | 1 | 41 | 17505 | 57745 | 260814 | 47 | 25580 | 7 | 45882 | 168605 | 48964 | 258076 | 50558 |
We end up with a dataframe of 323746 rows out of the initial 323750, which is consistent: the only rows removed are the 4 that had an empty project name. IDs are unique, which is good.
df[df["country"].isnull()].shape
(3797, 13)
df[df["usd pledged"].isnull()].shape
(3797, 13)
# Row-by-row check that 'usd pledged' and 'country' are missing on exactly
# the same rows (evaluates to a single boolean).  Comparing the `.all()`
# reductions of two filtered frames only compares per-column truthiness and
# cannot detect a row where just one of the two fields is missing.
(df['usd pledged'].isnull() == df['country'].isnull()).all()
ID True name True category True main_category True currency True deadline True goal True launched True pledged True state True backers True country True usd pledged True dtype: bool
We can conclude that there are 3797 rows with both country and usd pledged set to Null.
# Every column was read as text; cast the two date columns to datetime and
# the id/amount/count columns to numeric dtypes.
for date_col in ("deadline", "launched"):
    df[date_col] = pd.to_datetime(df[date_col])
for num_col in ("ID", "goal", "pledged", "backers", "usd pledged"):
    df[num_col] = pd.to_numeric(df[num_col])
df.dtypes
ID int64 name object category object main_category object currency object deadline datetime64[ns] goal float64 launched datetime64[ns] pledged float64 state object backers int64 country object usd pledged float64 dtype: object
Let's explore the dataset by constructing graphs in order to get more insight about the features and characteristics of a successful project.
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')
import plotly
import plotly.offline as py
py.init_notebook_mode(connected=True)
#import plotly.plotly as py
import plotly.graph_objs as go
import plotly.express as px
# Horizontal bar chart of the number of projects per state.
state_counts = (
    df["state"]
    .value_counts()
    .rename_axis('state')
    .reset_index(name='counts')
)
fig = px.bar(
    state_counts,
    x="counts",
    y="state",
    color="state",
    orientation="h",
    hover_name="counts",
    color_discrete_sequence=px.colors.qualitative.Plotly,
    title="State-counts plot",
)
fig.show()
import plotly.express as px
# Pie chart of the share of each state (missing values counted as well).
state_counts = (
    df["state"]
    .value_counts(dropna=False)
    .rename_axis('state')
    .reset_index(name='counts')
)
fig = px.pie(
    state_counts,
    names='state',
    values='counts',
    title='Repartition of states amongst Kickstarters projects',
)
fig.show()
Successful and failed projects together make up 87% of all projects.
Since we want to study successful/failed projects (the large majority of them, as we can see), I am going to drop the rows whose state is anything else. We thus end up with a binary target variable that we will want to predict in our model.
# Keep only the two outcomes used as the binary target; work on a copy so
# later edits do not touch df.
df_final = df[df["state"].isin(["failed", "successful"])].copy()
import plotly.express as px
# Pie chart of the failed/successful split in the filtered frame.
state_counts = (
    df_final["state"]
    .value_counts(dropna=False)
    .rename_axis('state')
    .reset_index(name='counts')
)
fig = px.pie(
    state_counts,
    names='state',
    values='counts',
    title='Repartition of states amongst Kickstarters projects',
)
fig.show()
A 60/40 distribution is fine, so we can consider the classes balanced.
df_final.isnull().any()
ID False name False category False main_category False currency False deadline False goal False launched False pledged False state False backers False country True usd pledged True dtype: bool
Let's investigate whether the feature usd pledged is correctly derived from the feature pledged.
df_final[["currency",'usd pledged', 'pledged']]
| currency | usd pledged | pledged | |
|---|---|---|---|
| 0 | GBP | 0.000000 | 0.0 |
| 1 | USD | 220.000000 | 220.0 |
| 2 | USD | 1.000000 | 1.0 |
| 4 | USD | 52375.000000 | 52375.0 |
| 5 | USD | 1205.000000 | 1205.0 |
| ... | ... | ... | ... |
| 323739 | GBP | 83.408540 | 50.0 |
| 323740 | GBP | 80.645161 | 52.0 |
| 323742 | USD | 850.000000 | 850.0 |
| 323743 | USD | 20.000000 | 20.0 |
| 323744 | USD | 545.000000 | 545.0 |
281853 rows × 3 columns
# Share (in %) of USD projects whose 'usd pledged' equals 'pledged'.
# Both conditions are combined into one mask: chaining two boolean
# selections makes pandas reindex the second mask against the already
# filtered frame and emit a UserWarning.
is_usd = df_final["currency"] == 'USD'
same_amount = df_final["pledged"] == df_final["usd pledged"]
(df_final[is_usd & same_amount].shape[0] * 100) / (df_final[is_usd].shape[0])
<ipython-input-42-e7548e3a79ea>:1: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
94.85552777475813
We can see here that for about 5% of the projects whose currency is USD, usd pledged does not match pledged. That is a lot of records, since the vast majority of our dataset uses USD (see the figure below).
import plotly.express as px
# Pie chart of the share of each project currency.
currency_counts = (
    df_final["currency"]
    .value_counts(dropna=False)
    .rename_axis('currency')
    .reset_index(name='counts')
)
fig = px.pie(
    currency_counts,
    names='currency',
    values='counts',
    title='Repartition of currencies amongst projects',
)
fig.show()
# Currencies of the projects with a non-zero 'pledged' that is equal to
# 'usd pledged'.  The two conditions are combined with `&` rather than
# chained, which avoids the mask reindexing and its UserWarning.
same_amount = df_final["pledged"] == df_final["usd pledged"]
non_zero = df_final["pledged"] != 0
df_final[same_amount & non_zero].currency.value_counts()
<ipython-input-44-bf9449f4f9b2>:1: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
USD 192617 GBP 1 Name: currency, dtype: int64
#df_final[df_final["pledged"]==df_final["real_usd_pledged"]][df_final["pledged"]!=0].currency.value_counts()
# GBP projects with a non-zero 'pledged' that is equal to 'usd pledged'.
# A single combined mask replaces three chained selections, which avoids
# the mask reindexing and its UserWarning.
same_amount = df_final["pledged"] == df_final["usd pledged"]
non_zero = df_final["pledged"] != 0
is_gbp = df_final["currency"] == "GBP"
df_final[same_amount & non_zero & is_gbp]
<ipython-input-46-3a8822307e07>:1: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
| ID | name | category | main_category | currency | deadline | goal | launched | pledged | state | backers | country | usd pledged | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 297576 | 843339380 | TARDIS: The Edge of Space Project II | Space Exploration | Technology | GBP | 2012-11-18 00:33:00 | 100.0 | 2012-10-31 21:43:18 | 213.0 | successful | 19 | GB | 213.0 |
There seem to be discrepancies in how the usd pledged feature was calculated. We choose to recalculate it from the pledged column, using a Python library to convert the amounts into USD based on the campaign end date and the project currency. We will also add usd_goal so that all amounts are in the same currency.
# Load 'out2.zip' and discard the original amount columns in one go.
df_final = pd.read_csv('out2.zip').drop(columns=['goal', 'pledged', 'usd pledged'])
df_final.describe(include='all')
| ID | name | category | main_category | currency | deadline | launched | state | backers | country | real_usd_pledged | usd_goal | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 2.818530e+05 | 281853 | 281853 | 281853 | 281853 | 281853 | 281853 | 281853 | 281853.000000 | 281643 | 2.818530e+05 | 2.818530e+05 |
| unique | NaN | 280147 | 158 | 15 | 13 | 257784 | 281499 | 2 | NaN | 21 | NaN | NaN |
| top | NaN | New EP/Music Development | Product Design | Film & Video | USD | 2012-01-01 05:59:00 | 2015-08-31 19:43:05 | failed | NaN | US | NaN | NaN |
| freq | NaN | 15 | 14540 | 51130 | 229975 | 46 | 2 | 168605 | NaN | 229824 | NaN | NaN |
| mean | 1.074956e+09 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 111.852530 | NaN | 9.293935e+03 | 4.115817e+04 |
| std | 6.194665e+08 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 996.270201 | NaN | 9.314645e+04 | 1.082761e+06 |
| min | 5.971000e+03 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.000000 | NaN | 0.000000e+00 | 1.000000e-02 |
| 25% | 5.373183e+08 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2.000000 | NaN | 5.000000e+01 | 2.000000e+03 |
| 50% | 1.075897e+09 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 15.000000 | NaN | 7.914802e+02 | 5.000000e+03 |
| 75% | 1.611414e+09 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 62.000000 | NaN | 4.500000e+03 | 1.500000e+04 |
| max | 2.147476e+09 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 219382.000000 | NaN | 2.033899e+07 | 1.663627e+08 |
# Cast the reloaded columns: the two date columns to datetime and the
# id/count/amount columns to numeric dtypes.
for date_col in ("deadline", "launched"):
    df_final[date_col] = pd.to_datetime(df_final[date_col])
for num_col in ("ID", "backers", "real_usd_pledged", "usd_goal"):
    df_final[num_col] = pd.to_numeric(df_final[num_col])
df_final.dtypes
ID int64 name object category object main_category object currency object deadline datetime64[ns] launched datetime64[ns] state object backers int64 country object real_usd_pledged float64 usd_goal float64 dtype: object
df_final.isnull().any()
ID False name False category False main_category False currency False deadline False launched False state False backers False country True real_usd_pledged False usd_goal False dtype: bool
df_final[df_final['country'].isnull()].head()
| ID | name | category | main_category | currency | deadline | launched | state | backers | country | real_usd_pledged | usd_goal | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1833 | 1012744036 | An Oratorio for our Time - Last Stop Cafe | Music | Music | USD | 2016-05-26 19:46:26 | 2016-04-26 19:46:26 | successful | 0 | NaN | 5170.00 | 5000.0 |
| 3670 | 1024989802 | Jackson Ruby: The Cassette Album | Music | Music | USD | 2016-05-11 19:29:17 | 2016-04-11 19:29:17 | successful | 0 | NaN | 5296.00 | 5000.0 |
| 4004 | 1027275369 | Help Parker Brown make his first solo album | Music | Music | USD | 2016-05-21 16:14:28 | 2016-04-21 16:14:28 | successful | 0 | NaN | 5077.00 | 3800.0 |
| 4180 | 1028691308 | Help BETHANY record a NEW single in Nash! | Music | Music | USD | 2016-05-08 22:37:00 | 2016-04-04 23:32:00 | successful | 0 | NaN | 3502.13 | 3500.0 |
| 6157 | 1041708793 | Serena Gabriel's first CD!!!!!!: Diving Deep | Music | Music | USD | 2016-05-01 04:22:00 | 2016-03-25 17:20:21 | successful | 0 | NaN | 3787.00 | 3500.0 |
df_final[df_final['country'].isnull()].shape
(210, 12)
Let's drop these: they have 0 backers, no country and previously had no usd pledged either, which suggests a mistake when the data was collected.
# Discard the rows that have no country.
df_final = df_final[df_final['country'].notnull()]
# Discard the rows labelled 'failed' although the pledged amount reached the
# goal, then renumber the index from 0.
failed_but_funded = (df_final['state'] == 'failed') & (
    df_final['real_usd_pledged'] >= df_final['usd_goal']
)
df_final = df_final.loc[~failed_but_funded].reset_index(drop=True)
df_final.isnull().any()
ID False name False category False main_category False currency False deadline False launched False state False backers False country False real_usd_pledged False usd_goal False dtype: bool
df_final.shape
(281637, 12)
df_final.duplicated().sum()
0
# Number of projects per name, as a two-column frame (name, counts).
counts = (
    df_final['name']
    .value_counts()
    .rename_axis('name')
    .reset_index(name='counts')
)
# All rows whose project name is used by more than one project.
duplicate_names = df_final[
    df_final['name'].isin(counts.loc[counts['counts'] > 1, 'name'].tolist())
]
duplicate_names.shape
(3022, 12)
duplicate_names.sort_values(by=['name']).head()
| ID | name | category | main_category | currency | deadline | launched | state | backers | country | real_usd_pledged | usd_goal | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1529 | 1010584633 | "A Fresh Start" | Shorts | Film & Video | USD | 2011-09-25 06:00:00 | 2011-08-28 21:27:52 | successful | 24 | US | 3000.0 | 3000.0 |
| 240126 | 713417995 | "A Fresh Start" | Documentary | Film & Video | USD | 2013-03-27 02:51:47 | 2013-01-26 03:51:47 | failed | 26 | US | 1417.0 | 5000.0 |
| 96268 | 1656736114 | "ONE" | Apps | Technology | USD | 2016-02-25 17:37:04 | 2016-01-26 17:37:04 | failed | 0 | US | 0.0 | 10000.0 |
| 86497 | 159049492 | "ONE" | Classical Music | Music | USD | 2016-09-26 03:19:12 | 2016-08-12 03:19:12 | successful | 113 | US | 10261.0 | 10000.0 |
| 281045 | 996180421 | "On The Road" | Webseries | Film & Video | USD | 2015-01-22 17:31:06 | 2014-12-23 17:31:06 | failed | 0 | US | 0.0 | 80000.0 |
I'll leave it as it is, but it's interesting to see that some duplicates seem genuine, others seem to be the same project revamped/relaunched, and others are another rendition of the same project (a theater play and a video, for instance).
It would be interesting to know more about the motives and mindset of the people creating these projects 'again' (a renewed need for funds?), and whether there are cases where a past successful project is rebooted (a hoax?).
Overall, these rows can still be included in our model, as we want to predict the success/failure of a campaign regardless.
def plot_cat_value_counts(df: pd.DataFrame, column: str, legend_name: str, state: str="state", num:int=-1, normalize:bool=True):
    """Plot the value distribution of ``column`` for failed, successful and all rows.

    Draws a plotly figure with three bar charts: failed projects (top left),
    successful projects (top right) and all projects (bottom, full width),
    and displays it with ``iplot``.

    Args:
        df: Frame containing at least the ``column`` and ``state`` columns.
        column: Name of the column whose values are counted.
        legend_name: Label used in the trace names and in the title of the
            bottom chart.
        state: Name of the column holding the outcome; rows are split on the
            values "failed" and "successful".
        num: Maximum number of most frequent values shown in each chart.
            A negative value (the default) or ``None`` shows every value.
        normalize: When True, the failed/successful charts show shares
            instead of raw counts. The overall chart always shows raw counts.

    Returns:
        None. The figure is displayed as a side effect.
    """
    # Series.head(-1) returns everything EXCEPT the last row, so the default
    # num=-1 used to silently drop the least frequent value.  Translate
    # "no limit" into an open-ended slice instead.
    top = slice(None) if num is None or num < 0 else slice(num)

    main_cats = df[column].value_counts().iloc[top]
    main_cats_failed = df[df[state] == "failed"][column].value_counts(normalize=normalize).iloc[top]
    main_cats_sucess = df[df[state] == "successful"][column].value_counts(normalize=normalize).iloc[top]

    # One bar trace per chart.
    trace0 = go.Bar(
        x=main_cats_failed.index,
        y=main_cats_failed.values,
        name="Failed "+ legend_name
    )
    trace1 = go.Bar(
        x=main_cats_sucess.index,
        y=main_cats_sucess.values,
        name="Successful "+ legend_name
    )
    trace2 = go.Bar(
        x=main_cats.index,
        y=main_cats.values,
        name= legend_name +" Distribution"
    )

    # 2x2 grid whose second row is a single cell spanning both columns.
    fig = tls.make_subplots(rows=2, cols=2, specs=[[{}, {}], [{'colspan': 2}, None]],
                            subplot_titles=('Failed','Sucessful', "General "+ legend_name), vertical_spacing = 0.4)

    # add_trace(row=, col=) replaces the deprecated append_trace.
    fig.add_trace(trace0, row=1, col=1)
    fig.add_trace(trace1, row=1, col=2)
    fig.add_trace(trace2, row=2, col=1)

    fig.update_layout(showlegend=True,
                      title=column.upper().replace("_"," ")+" Distribution",
                      title_x=0.43,
                      bargap=0.1)
    iplot(fig)
plot_cat_value_counts(df_final, "main_category", "Categories")
plot_cat_value_counts(df_final, "category", "Sub_categories", num = 15, normalize=False)
# Combined "main_category-category" label.  Vectorised string concatenation
# gives the same labels as joining each row with agg(..., axis=1) but avoids
# one Python-level call per row.
df_final['cat_full'] = df_final["main_category"] + '-' + df_final["category"]
df_final[['cat_full', 'category']].describe()
| cat_full | category | |
|---|---|---|
| count | 281637 | 281637 |
| unique | 165 | 158 |
| top | Design-Product Design | Product Design |
| freq | 14538 | 14538 |
# Number of failed / successful projects per main category.
is_failed = df_final["state"] == "failed"
is_success = df_final["state"] == "successful"
main_cats_failed = df_final.loc[is_failed, "main_category"].value_counts()
main_cats_success = df_final.loc[is_success, "main_category"].value_counts()

# Two semi-transparent bar traces drawn on top of each other.
trace1 = go.Bar(
    name="Successful Categories",
    x=main_cats_success.index,
    y=main_cats_success.values,
    opacity=0.60,
)
trace2 = go.Bar(
    name="Failed Categories",
    x=main_cats_failed.index,
    y=main_cats_failed.values,
    opacity=0.60,
)

data = [trace1, trace2]
layout = go.Layout(
    barmode='overlay',
    title=go.layout.Title(text="Failed and Success over Main categories"),
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)
main_cats_failed = df_final[df_final["state"] == "failed"]["cat_full"].value_counts()
main_cats_success = df_final[df_final["state"] == "successful"]["cat_full"].value_counts()

# Pair every category with its (failed, successful) counts.  A category that
# is missing on one side counts as 0 there: previously a category without any
# success raised KeyError and a category without any failure was ignored.
# Categories keep the order of main_cats_failed; success-only ones follow.
failed_by_cat = main_cats_failed.to_dict()
success_by_cat = main_cats_success.to_dict()
success_only = [cat for cat in success_by_cat if cat not in failed_by_cat]
dico = {
    cat: (failed_by_cat.get(cat, 0), success_by_cat.get(cat, 0))
    for cat in [*failed_by_cat, *success_only]
}

# Split on whether successes strictly outnumber failures.
more_success_than_failed = {k: v for k, v in dico.items() if v[0] < v[1]}
more_failed_than_success = {k: v for k, v in dico.items() if v[0] >= v[1]}
# Keep only the categories in which successful projects outnumber failed ones.
winning_cats = list(more_success_than_failed.keys())
success_subset = main_cats_success[main_cats_success.index.isin(winning_cats)]
failed_subset = main_cats_failed[main_cats_failed.index.isin(winning_cats)]

# Successful counts
trace1 = go.Bar(
    name="Successful Categories",
    x=success_subset.index,
    y=success_subset.values,
    opacity=0.60,
)
# Failed counts
trace2 = go.Bar(
    name="Failed Categories",
    x=failed_subset.index,
    y=failed_subset.values,
    opacity=0.60,
)

data = [trace1, trace2]
layout = go.Layout(
    title=go.layout.Title(text="Most success in categories"),
    barmode='overlay',
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)
# Keep only the categories in which failed projects are at least as numerous
# as successful ones.
losing_cats = list(more_failed_than_success.keys())
success_subset = main_cats_success[main_cats_success.index.isin(losing_cats)]
failed_subset = main_cats_failed[main_cats_failed.index.isin(losing_cats)]

# Successful counts
trace1 = go.Bar(
    name="Successful Categories",
    x=success_subset.index,
    y=success_subset.values,
    opacity=0.60,
)
# Failed counts
trace2 = go.Bar(
    name="Failed Categories",
    x=failed_subset.index,
    y=failed_subset.values,
    opacity=0.60,
)

data = [trace1, trace2]
layout = go.Layout(
    title=go.layout.Title(text="Most failed in categories"),
    barmode='overlay',
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)
# Iterating a dict yields its keys, so it can be joined directly.
print("The most promising categories to start a kickstarter in are:", ", ".join(more_success_than_failed))
The most promising categories to start a kickstarter in are: Music-Music, Film & Video-Shorts, Games-Tabletop Games, Music-Rock, Theater-Theater, Comics-Comics, Music-Indie Rock, Music-Pop, Music-Country & Folk, Art-Public Art, Art-Illustration, Publishing-Art Books, Music-Classical Music, Comics-Comic Books, Dance-Dance, Music-Jazz, Comics-Graphic Novels, Theater-Plays, Theater-Musical, Technology-DIY Electronics, Dance-Performances, Comics-Webcomics, Art-Installations, Theater-Festivals, Technology-Camera Equipment, Theater-Experimental, Music-Punk, Theater-Immersive, Film & Video-Festivals, Publishing-Literary Journals, Publishing-Anthologies, Theater-Spaces, Crafts-Knitting, Comics-Anthologies, Crafts-Pottery, Design-Typography, Dance-Residencies, Crafts-Letterpress, Music-Chiptune
# Iterating a dict yields its keys, so it can be joined directly.
print("The less promising categories to start a kickstarter in are:", ", ".join(more_failed_than_success))
The less promising categories to start a kickstarter in are: Design-Product Design, Film & Video-Documentary, Food-Food, Games-Video Games, Publishing-Fiction, Film & Video-Film & Video, Fashion-Fashion, Publishing-Nonfiction, Technology-Apps, Technology-Technology, Fashion-Apparel, Art-Art, Publishing-Children's Books, Photography-Photography, Film & Video-Webseries, Publishing-Publishing, Film & Video-Narrative Film, Music-Hip-Hop, Crafts-Crafts, Technology-Web, Technology-Software, Design-Design, Food-Restaurants, Art-Painting, Technology-Hardware, Games-Games, Art-Mixed Media, Film & Video-Animation, Food-Drinks, Fashion-Accessories, Food-Food Trucks, Technology-Gadgets, Music-Electronic Music, Games-Mobile Games, Journalism-Journalism, Art-Performance Art, Film & Video-Comedy, Design-Graphic Design, Film & Video-Drama, Food-Small Batch, Music-World Music, Art-Sculpture, Games-Playing Cards, Journalism-Web, Publishing-Poetry, Food-Farms, Art-Digital Art, Crafts-DIY, Photography-People, Photography-Photobooks, Crafts-Woodworking, Film & Video-Television, Publishing-Periodicals, Film & Video-Horror, Games-Live Games, Fashion-Jewelry, Art-Conceptual Art, Fashion-Ready-to-wear, Photography-Places, Music-Faith, Publishing-Academic, Publishing-Young Adult, Film & Video-Action, Technology-Wearables, Food-Events, Design-Architecture, Publishing-Radio & Podcasts, Journalism-Print, Fashion-Footwear, Photography-Nature, Film & Video-Thrillers, Photography-Fine Art, Music-Metal, Film & Video-Music Videos, Film & Video-Science Fiction, Film & Video-Experimental, Food-Cookbooks, Journalism-Video, Food-Farmer's Markets, Technology-3D Printing, Crafts-Candles, Fashion-Childrenswear, Technology-Flight, Food-Vegan, Music-R&B, Design-Interactive Design, Technology-Robots, Technology-Sound, Journalism-Audio, Food-Spaces, Film & Video-Family, Food-Community Gardens, Games-Gaming Hardware, Fashion-Couture, Film & Video-Fantasy, Photography-Animals, Design-Civic Design, 
Publishing-Zines, Technology-Space Exploration, Food-Bacon, Film & Video-Movie Theaters, Art-Textiles, Technology-Fabrication Tools, Journalism-Photo, Publishing-Calendars, Art-Ceramics, Music-Kids, Crafts-Printing, Art-Video Art, Crafts-Crochet, Technology-Makerspaces, Music-Blues, Games-Puzzles, Crafts-Stationery, Film & Video-Romance, Publishing-Translations, Dance-Spaces, Dance-Workshops, Crafts-Glass, Comics-Events, Crafts-Embroidery, Fashion-Pet Fashion, Music-Latin, Crafts-Quilts, Crafts-Weaving, Crafts-Taxidermy
import plotly.graph_objects as go
# Turn each (failed count, successful count) pair into
# (failed percent, successful percent) of that category's total.
dico2 = {
    cat: ((n_failed * 100) / (n_failed + n_success), (n_success * 100) / (n_failed + n_success))
    for cat, (n_failed, n_success) in more_success_than_failed.items()
}

# Stacked bars: success share at the bottom, failed share on top.
success_bar = go.Bar(
    name='Success Percent',
    x=list(dico2),
    y=[pct[1] for pct in dico2.values()],
)
failed_bar = go.Bar(
    name='Failed Percent',
    x=list(dico2),
    y=[pct[0] for pct in dico2.values()],
)
fig = go.Figure(data=[success_bar, failed_bar])
fig.update_layout(
    barmode='stack',
    title=go.layout.Title(text="Percent of most successful in categories"),
)
fig.show()
# Mean goal (usd_goal) for every (full category, final state) pair.
main_goal_cat = df_final.groupby(['cat_full', 'state'], as_index=False)['usd_goal'].mean()

# Combine the state filter and the category filter into ONE boolean mask.
# Chaining two [] selections applied a mask indexed like main_goal_cat to an
# already filtered frame, which made pandas emit
# "Boolean Series key will be reindexed to match DataFrame index".
in_winning_cats = main_goal_cat["cat_full"].isin(list(more_success_than_failed.keys()))
main_cats_failed = main_goal_cat[(main_goal_cat["state"] == "failed") & in_winning_cats]
main_cats_success = main_goal_cat[(main_goal_cat["state"] == "successful") & in_winning_cats]

# Mean goal of the successful projects
trace1 = go.Bar(
    x=main_cats_success.cat_full,
    y=main_cats_success.usd_goal,
    opacity=0.60,
    name="Successful Categories"
)
# Mean goal of the failed projects
trace2 = go.Bar(
    x=main_cats_failed.cat_full,
    y=main_cats_failed.usd_goal,
    opacity=0.60,
    name="Failed Categories"
)

data = [trace1, trace2]
# The bars show mean goals, not project counts, so the title says so
# (the previous title was the one used for the count chart).
layout = go.Layout(
    barmode='overlay',
    title=go.layout.Title(text="Mean usd_goal in the most successful categories"),
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)
<ipython-input-76-46785aed45c0>:2: UserWarning: Boolean Series key will be reindexed to match DataFrame index. <ipython-input-76-46785aed45c0>:3: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
import plotly.graph_objects as go
# Turn each (failed count, successful count) pair into
# (failed percent, successful percent) of that category's total.
dico2 = {
    cat: ((n_failed * 100) / (n_failed + n_success), (n_success * 100) / (n_failed + n_success))
    for cat, (n_failed, n_success) in more_failed_than_success.items()
}

# Stacked bars: success share at the bottom, failed share on top.
success_bar = go.Bar(
    name='Success Percent',
    x=list(dico2),
    y=[pct[1] for pct in dico2.values()],
)
failed_bar = go.Bar(
    name='Failed Percent',
    x=list(dico2),
    y=[pct[0] for pct in dico2.values()],
)
fig = go.Figure(data=[success_bar, failed_bar])
fig.update_layout(
    barmode='stack',
    title=go.layout.Title(text="Percent of most failed in categories"),
)
fig.show()
# Five successful projects with the most backers.
df_final.loc[df_final['state'] == 'successful'].sort_values(by='backers', ascending=False).head(5)
| ID | name | category | main_category | currency | deadline | launched | state | backers | country | real_usd_pledged | usd_goal | cat_full | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 139678 | 1955357092 | Exploding Kittens | Tabletop Games | Games | USD | 2015-02-20 03:00:00 | 2015-01-20 19:00:19 | successful | 219382 | US | 8782571.99 | 10000.0 | Games-Tabletop Games |
| 56611 | 1386523707 | Fidget Cube: A Vinyl Desk Toy | Product Design | Design | USD | 2016-10-20 03:00:00 | 2016-08-30 22:02:09 | successful | 154926 | US | 6465690.30 | 15000.0 | Design-Product Design |
| 217615 | 557230947 | Bring Reading Rainbow Back for Every Child;Eve... | Web | Technology | USD | 2014-07-02 21:00:00 | 2014-05-28 15:05:45 | successful | 105857 | US | 5408916.95 | 1000000.0 | Technology-Web |
| 110616 | 1755266685 | The Veronica Mars Movie Project | Narrative Film | Film & Video | USD | 2013-04-13 05:00:00 | 2013-03-13 15:42:22 | successful | 91585 | US | 5702153.38 | 2000000.0 | Film & Video-Narrative Film |
| 135994 | 1929840910 | Double Fine Adventure | Video Games | Games | USD | 2012-03-14 01:00:00 | 2012-02-09 02:52:52 | successful | 87142 | US | 3336371.92 | 400000.0 | Games-Video Games |
Exploding Kittens is in first place! I really enjoyed playing this game, and I know that its campaign was indeed very popular.
# Five successful projects with the highest goal.
df_final.loc[df_final['state'] == 'successful'].sort_values(by='usd_goal', ascending=False).head(5)
| ID | name | category | main_category | currency | deadline | launched | state | backers | country | real_usd_pledged | usd_goal | cat_full | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 264321 | 880009511 | Elite: Dangerous | Video Games | Games | GBP | 2013-01-05 01:00:57 | 2012-11-06 01:00:57 | successful | 25681 | GB | 2.528259e+06 | 2.002339e+06 | Games-Video Games |
| 66632 | 1454565733 | Bring Back MYSTERY SCIENCE THEATER 3000 | Television | Film & Video | USD | 2015-12-12 07:00:00 | 2015-11-10 16:49:32 | successful | 48270 | US | 5.764229e+06 | 2.000000e+06 | Film & Video-Television |
| 274133 | 947809001 | Shenmue 3 | Video Games | Games | USD | 2015-07-18 03:51:04 | 2015-06-16 03:51:04 | successful | 69320 | US | 6.333296e+06 | 2.000000e+06 | Games-Video Games |
| 127565 | 1871494789 | WISH I WAS HERE | Narrative Film | Film & Video | USD | 2013-05-24 21:00:00 | 2013-04-24 11:57:04 | successful | 46520 | US | 3.105473e+06 | 2.000000e+06 | Film & Video-Narrative Film |
| 100051 | 1682353635 | Camelot Unchained | Video Games | Games | USD | 2013-05-02 17:56:11 | 2013-04-02 17:56:11 | successful | 14873 | US | 2.232933e+06 | 2.000000e+06 | Games-Video Games |
# Ten successful projects that raised the most money.
df_final.loc[df_final['state'] == 'successful'].sort_values(by='real_usd_pledged', ascending=False).head(10)
| ID | name | category | main_category | currency | deadline | launched | state | backers | country | real_usd_pledged | usd_goal | cat_full | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 117102 | 1799979574 | Pebble Time - Awesome Smartwatch;No Compromises | Product Design | Design | USD | 2015-03-28 03:00:00 | 2015-02-24 15:44:42 | successful | 78471 | US | 20338986.27 | 500000.0 | Design-Product Design |
| 186319 | 342886736 | COOLEST COOLER: 21st Century Cooler that's Act... | Product Design | Design | USD | 2014-08-30 03:00:00 | 2014-07-08 10:14:37 | successful | 62642 | US | 13285226.36 | 50000.0 | Design-Product Design |
| 161210 | 2103598555 | Pebble 2;Time 2 + All-New Pebble Core | Product Design | Design | USD | 2016-06-30 07:00:00 | 2016-05-24 15:49:52 | successful | 66673 | US | 12779843.49 | 1000000.0 | Design-Product Design |
| 210261 | 506924864 | Pebble: E-Paper Watch for iPhone and Android | Product Design | Design | USD | 2012-05-19 05:00:00 | 2012-04-11 06:59:04 | successful | 68929 | US | 10266845.74 | 100000.0 | Design-Product Design |
| 218800 | 565687737 | The World's Best TRAVEL JACKET with 15 Feature... | Product Design | Design | USD | 2015-09-03 19:00:00 | 2015-07-07 13:52:34 | successful | 44949 | US | 9192055.66 | 20000.0 | Design-Product Design |
| 139678 | 1955357092 | Exploding Kittens | Tabletop Games | Games | USD | 2015-02-20 03:00:00 | 2015-01-20 19:00:19 | successful | 219382 | US | 8782571.99 | 10000.0 | Games-Tabletop Games |
| 4944 | 1033978702 | OUYA: A New Kind of Video Game Console | Gaming Hardware | Games | USD | 2012-08-09 07:00:00 | 2012-07-10 14:44:41 | successful | 63416 | US | 8596474.58 | 950000.0 | Games-Gaming Hardware |
| 201948 | 450099426 | The Everyday Backpack;Tote;and Sling | Product Design | Design | USD | 2016-09-10 02:00:00 | 2016-07-13 00:47:35 | successful | 26359 | US | 6565782.50 | 500000.0 | Design-Product Design |
| 56611 | 1386523707 | Fidget Cube: A Vinyl Desk Toy | Product Design | Design | USD | 2016-10-20 03:00:00 | 2016-08-30 22:02:09 | successful | 154926 | US | 6465690.30 | 15000.0 | Design-Product Design |
| 274133 | 947809001 | Shenmue 3 | Video Games | Games | USD | 2015-07-18 03:51:04 | 2015-06-16 03:51:04 | successful | 69320 | US | 6333295.77 | 2000000.0 | Games-Video Games |
These are mostly projects offering consumer-facing (B2C) rewards, in the product design and games categories. Such projects involve high goals and need a lot of backers to reach a successful outcome.
def plot_continuous_vars(data, column_name, bins=50):
    """Plot a column's distribution on its raw scale and on a log1p scale.

    Draws two side-by-side density histograms with a KDE overlay: the left
    panel shows ``data[column_name]`` and the right panel shows
    ``np.log1p(data[column_name])``.

    Args:
        data: Object indexable by ``column_name``; the notebook passes a
            pandas DataFrame.
        column_name: Name of the numeric column to plot.
        bins: Bin specification forwarded to ``sns.histplot``. The fixed
            default keeps the bin count bounded on heavily skewed columns.
            NOTE(review): the old ``distplot`` also limited its automatic
            bin count -- confirm that 50 matches the previous look.
    """
    plot_dims = (14, 8)
    fig, (ax1, ax2) = plt.subplots(ncols=2, sharey=False, figsize=plot_dims)
    values = data[column_name]
    # sns.distplot is deprecated and scheduled for removal; histplot with
    # kde=True and stat="density" is its axes-level replacement.
    sns.histplot(values, kde=True, stat="density", bins=bins, ax=ax1)
    sns.histplot(np.log1p(values), kde=True, stat="density", bins=bins, ax=ax2)
    # Both panels would otherwise carry the same x label; mark the transformed one.
    ax2.set_xlabel("log1p({})".format(column_name))
# Distribution of the goal amount, raw and log1p-transformed.
plot_continuous_vars(data=df_final, column_name='usd_goal')
C:\Users\ymaricar\anaconda3\lib\site-packages\seaborn\distributions.py:2551: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). C:\Users\ymaricar\anaconda3\lib\site-packages\seaborn\distributions.py:2551: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
# Distribution of the pledged amount, raw and log1p-transformed.
plot_continuous_vars(data=df_final, column_name='real_usd_pledged')
C:\Users\ymaricar\anaconda3\lib\site-packages\seaborn\distributions.py:2551: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). C:\Users\ymaricar\anaconda3\lib\site-packages\seaborn\distributions.py:2551: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
We take the log (log1p) to see the distributions better, since there are outliers in both cases.
# Split the projects by final state.
df_failed = df_final.loc[df_final["state"] == "failed"]
df_sucess = df_final.loc[df_final["state"] == "successful"]

# log(goal + 1) for each group; the +1 keeps a goal of 0 finite.
failed = np.log(df_failed['usd_goal'] + 1)
success = np.log(df_sucess['usd_goal'] + 1)

# Two overlaid, probability-normalised histograms with 30 bins requested each.
trace1 = go.Histogram(
    name='Goals Failed',
    x=failed,
    nbinsx=30,
    histnorm='probability',
    opacity=0.60,
)
trace2 = go.Histogram(
    name='Goals Sucessful',
    x=success,
    nbinsx=30,
    histnorm='probability',
    opacity=0.60,
)

data = [trace1, trace2]
layout = go.Layout(
    title=go.layout.Title(text="Distributions of usd_goal"),
    barmode='overlay',
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)
Based on the histogram above, the failed projects tend to have higher values, i.e. higher goals.
import plotly.express as px
# Box plot of the goal amount for each main category.
fig = px.box(
    df_final,
    x="main_category",
    y="usd_goal",
)
fig.show()
# Split the projects by final state.
df_failed = df_final.loc[df_final["state"] == "failed"]
df_success = df_final.loc[df_final["state"] == "successful"]

# Backer-count distribution for each group: failed first, then successful.
plot_continuous_vars(data=df_failed, column_name='backers')
plot_continuous_vars(data=df_success, column_name='backers')
C:\Users\ymaricar\anaconda3\lib\site-packages\seaborn\distributions.py:2551: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). C:\Users\ymaricar\anaconda3\lib\site-packages\seaborn\distributions.py:2551: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). C:\Users\ymaricar\anaconda3\lib\site-packages\seaborn\distributions.py:2551: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). C:\Users\ymaricar\anaconda3\lib\site-packages\seaborn\distributions.py:2551: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).